Análisis de https://www.nature.com/articles/srep00196.pdf
Podemos usar read_lines_chunked si el archivo original es grande. En este ejemplo, filtramos las recetas East Asian:
library(tidyverse)
limpiar <- function(lineas,...){
str_split(lineas, ',') %>%
keep(~.x[1] == 'EastAsian') %>%
map(~.x[-1]) %>% # quitar tipo de cocina
map(~.x[nchar(.x) > 0]) # quitar elementos vacios
}
callback_limpiar <- ListCallback$new(limpiar)
filtrado <- read_lines_chunked('../datos/recetas/srep00196-s3.csv',
skip = 1, callback = callback_limpiar, chunk_size = 1000)
|========== | 13%
|============ | 15%
|============= | 16%
|============= | 18% 1 MB
|============== | 20% 1 MB
|================ | 22% 1 MB
|================= | 24% 1 MB
|================== | 25% 1 MB
|==================== | 27% 1 MB
|===================== | 29% 1 MB
|====================== | 31% 1 MB
|======================= | 32% 1 MB
|========================= | 34% 1 MB
|========================== | 36% 2 MB
|=========================== | 37% 2 MB
|============================ | 39% 2 MB
|============================== | 41% 2 MB
|=============================== | 43% 2 MB
|================================ | 44% 2 MB
|================================= | 46% 2 MB
|=================================== | 48% 2 MB
|==================================== | 49% 2 MB
|===================================== | 51% 2 MB
|====================================== | 53% 2 MB
|======================================== | 55% 3 MB
|========================================= | 56% 3 MB
|========================================== | 58% 3 MB
|=========================================== | 60% 3 MB
|============================================= | 61% 3 MB
|============================================== | 63% 3 MB
|=============================================== | 65% 3 MB
|================================================ | 67% 3 MB
|================================================== | 68% 3 MB
|=================================================== | 70% 3 MB
|==================================================== | 72% 4 MB
|===================================================== | 73% 4 MB
|======================================================= | 75% 4 MB
|======================================================== | 77% 4 MB
|========================================================= | 78% 4 MB
|========================================================== | 80% 4 MB
|============================================================ | 82% 4 MB
|============================================================= | 84% 4 MB
|============================================================== | 85% 4 MB
|================================================================ | 87% 4 MB
|================================================================= | 89% 5 MB
|================================================================== | 91% 5 MB
|==================================================================== | 93% 5 MB
|===================================================================== | 95% 5 MB
|======================================================================= | 97% 5 MB
|========================================================================| 99% 5 MB
|=========================================================================| 100% 5 MB
recetas <- filtrado %>% flatten
recetas[1:10]
[[1]]
[1] "beef_broth" "egg" "soy_sauce" "soybean"
[[2]]
[1] "sesame_oil" "beef" "roasted_sesame_seed"
[4] "matsutake" "black_pepper" "scallion"
[7] "garlic" "soy_sauce"
[[3]]
[1] "vinegar" "roasted_sesame_seed" "cayenne"
[4] "scallion" "garlic" "soybean"
[7] "cucumber" "rice"
[[4]]
[1] "beef" "roasted_sesame_seed" "soy_sauce"
[4] "cayenne" "ginger" "scallion"
[7] "lettuce" "garlic" "vegetable"
[10] "sake"
[[5]]
[1] "garlic" "fish" "cayenne" "soy_sauce" "potato"
[[6]]
[1] "sweet_potato" "onion" "roasted_sesame_seed"
[4] "soy_sauce" "cayenne" "ginger"
[7] "soybean" "vegetable" "cabbage"
[10] "rice" "chicken" "sesame_oil"
[[7]]
[1] "sesame_oil" "radish" "fish"
[4] "black_pepper" "ginger" "garlic"
[7] "seaweed" "shrimp" "beef"
[10] "roasted_sesame_seed" "soy_sauce" "cayenne"
[13] "chinese_cabbage" "scallion" "sesame_seed"
[16] "rice"
[[8]]
[1] "vinegar" "radish" "fish" "cayenne" "scallion" "cucumber" "soybean"
[8] "vegetable" "garlic" "rice" "soy_sauce"
[[9]]
[1] "radish" "fish" "cayenne" "ginger" "scallion"
[6] "garlic" "vegetable_oil" "soy_sauce"
[[10]]
[1] "nut" "cucumber" "sesame_seed" "soybean"
library(arules)
length(recetas)
[1] 2512
## No hacer mucho más chico que este soporte, pues tenemos relativamente
## pocas transacciones:
pars <- list(support = 0.05, target = 'frequent itemsets',
ext = TRUE)
ap_recetas <- apriori(recetas, parameter = pars)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 125
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [41 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [628 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
length(ap_recetas)
[1] 628
Vemos los items frecuentes
frecs <- ap_recetas %>% subset(size(.) == 1 ) %>% sort(by = 'support') %>%
DATAFRAME
DT::datatable(frecs %>% mutate_if(is.numeric, function(x) round(x, 3)))
Y ahora examinamos combinaciones frecuentes de distintos tamaños
ap_recetas %>%
subset(size(.) == 2) %>%
subset(support > 0.20) %>%
sort(by = 'support') %>%
inspect
Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta: estos ingredientes son bases de salsas, combinaciones de condimentos:
ap_recetas %>%
subset(size(.) == 4) %>%
subset(support > 0.10) %>%
sort(by = 'support') %>%
inspect
pars <- list(support = 0.01, confidence = 0.10,
target = 'rules',
ext = TRUE)
reglas_recetas <- apriori(recetas, parameter = pars)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 25
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [88 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 done [0.02s].
writing ... [50181 rule(s)] done [0.02s].
creating S4 object ... done [0.01s].
agregar_hyperlift <- function(reglas, trans){
quality(reglas) <- cbind(quality(reglas),
hyper_lift = interestMeasure(reglas, measure = "hyperLift",
transactions = trans))
reglas
}
reglas_recetas <- agregar_hyperlift(reglas_recetas, recetas)
library(arulesViz)
Loading required package: grid
Registered S3 method overwritten by 'data.table':
method from
print.data.table
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.1 & support > 0.1 & confidence > 0.40)
length(reglas_1)
[1] 213
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
#inspect(reglas_tam_2 %>% sort(by = 'hyper_lift'))
plot(reglas_1 %>% subset(support > 0.2), engine = "plotly")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
library(tidygraph)
library(ggraph)
frecs <-
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% data.frame
df_reglas$weight <- log(df_reglas$lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
mutate(centrality = centrality_degree(mode = "all"))
set.seed(881)
ggraph(graph_1, layout = 'fr') +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph(base_family = "sans")
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.5 & confidence > 0.1)
length(reglas_1)
[1] 11068
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
length(reglas_tam_2)
[1] 132
library(tidygraph)
library(ggraph)
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% as_data_frame
`as_data_frame()` is deprecated as of tibble 2.0.0.
Please use `as_tibble()` instead.
The signature and semantics have changed, see `?as_tibble`.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
df_reglas$weight <- log(df_reglas$hyper_lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
mutate(centrality = centrality_degree(mode = "all"))
ggraph(graph_1, layout = 'fr', start.temp=100) +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph(base_family = "sans")
Exportamos para examinar en Gephi:
write_csv(df_reglas %>% rename(source=from, target=to) %>%
select(-count),
path='reglas.csv')
La combinación corn y starch puede deberse en parte a una separación incorrecta en el procesamiento de los datos (corn starch o maizena convertido en dos ingredientes, corn y starch):
```r
df_reglas %>% filter(from == \{corn}\, to == \{starch}\)
<!-- rnb-source-end -->
<!-- rnb-frame-begin eyJtZXRhZGF0YSI6eyJjbGFzc2VzIjpbInRibF9kZiIsInRibCIsImRhdGEuZnJhbWUiXSwibnJvdyI6MSwibmNvbCI6OX0sInJkZiI6IlVrUllNd3BZQ2dBQUFBTUFCQUFEQUFNRkFBQUFBQVZWVkVZdE9BQUFCQUlBQUFBQkFBUUFDUUFBQUFGNEFBQURFd0FBQUFrQUFBTU5BQUFBQVFBQUFBc0FBQVFDQUFBQUFRQUVBQWtBQUFBR2JHVjJaV3h6QUFBQUVBQUFBRElBQkFBSkFBQUFDbnRpY21GemMybGpZWDBBQkFBSkFBQUFDSHRpWVhKc1pYbDlBQVFBQ1FBQUFBMTdhMkYwYzNWdlluVnphR2w5QUFRQUNRQUFBQWQ3WW5KbFlXUjlBQVFBQ1FBQUFBWjdiV2xzYTMwQUJBQUpBQUFBQ0h0M1lYTmhZbWw5QUFRQUNRQUFBQWw3YzJWaGQyVmxaSDBBQkFBSkFBQUFCM3Q1WldGemRIMEFCQUFKQUFBQUNIdHphR1Z5Y25sOUFBUUFDUUFBQUF4N1kyVnNaWEo1WDI5cGJIMEFCQUFKQUFBQUJudGpiM0p1ZlFBRUFBa0FBQUFJZTNSdmJXRjBiMzBBQkFBSkFBQUFDWHR0ZFhOMFlYSmtmUUFFQUFrQUFBQUtlMk4xWTNWdFltVnlmUUFFQUFrQUFBQUdlM0JsWVhKOUFBUUFDUUFBQUFWN2JuVjBmUUFFQUFrQUFBQUllM0poWkdsemFIMEFCQUFKQUFBQUQzdGphR2xqYTJWdVgySnliM1JvZlFBRUFBa0FBQUFHZTNkcGJtVjlBQVFBQ1FBQUFBbDdZMmhwWTJ0bGJuMEFCQUFKQUFBQUNIdHpkR0Z5WTJoOUFBUUFDUUFBQUFaN1ltVmhibjBBQkFBSkFBQUFDWHQyYVc1bFoyRnlmUUFFQUFrQUFBQUxlMlZ1YjJ0cFpHRnJaWDBBQkFBSkFBQUFCbnRyWld4d2ZRQUVBQWtBQUFBS2UyMTFjMmh5YjI5dGZRQUVBQWtBQUFBS2UzTm9hV2wwWVd0bGZRQUVBQWtBQUFBSGUyOXVhVzl1ZlFBRUFBa0FBQUFKZTJOaFltSmhaMlY5QUFRQUNRQUFBQVo3Y0c5eWEzMEFCQUFKQUFBQUJudGpiR0Z0ZlFBRUFBa0FBQUFIZTNOeGRXbGtmUUFFQUFrQUFBQUllM05vY21sdGNIMEFCQUFKQUFBQUNIdHdaWEJ3WlhKOUFBUUFDUUFBQUFoN2IzbHpkR1Z5ZlFBRUFBa0FBQUFHZTNOaGEyVjlBQVFBQ1FBQUFBWjdabWx6YUgwQUJBQUpBQUFBREh0aVpXVm1YMkp5YjNSb2ZRQUVBQWtBQUFBTmUySmxiR3hmY0dWd2NHVnlmUUFFQUFrQUFBQVJlMk5vYVc1bGMyVmZZMkZpWW1GblpYMEFCQUFKQUFBQUMzdDJaV2RsZEdGaWJHVjlBQVFBQ1FBQUFBaDdaMmx1WjJWeWZRQUVBQWtBQUFBSmUyTmhlV1Z1Ym1WOUFBUUFDUUFBQUFoN1kyRnljbTkwZlFBRUFBa0FBQUFHZTJKbFpXWjlBQVFBQ1FBQUFBZDdkMmhsWVhSOUFBUUFDUUFBQUFWN1pXZG5mUUFFQUFrQUFBQVZlM0p2WVhOMFpXUmZjMlZ6WVcxbFgzTmxaV1I5QUFRQUNRQUFBQXg3YzJWellXMWxYMjlwYkgwQUJBQUpBQUFBRG50aWJHRmphMTl3WlhCd1pYSjlBQUFFQWdBQUFBRUFCQUFKQUFBQUJXTnNZWE56QUFBQUVBQUFBQUVBQkFBSkFBQUFCbVpoWTNSdmNnQUFBUDRBQUFNTkFBQUFBUUFBQUJnQUFBUUNBQUFDL3dBQUFCQUFBQUF0QUFRQUNRQUFBQWw3WTJGNVpXNXVaWDBBQkFBSkFBQUFDWHR6YjNsaVpXRnVmUUFFQUFrQUFBQUdlM0pwWTJWOUFBUUFDUUFBQUFaN2MyRnJaWDBBQkFBSkFBQUFCWHRsWjJkOUFBUUFDUUFBQUFsN2MyVmhkMlZsWkgwQUJBQUpBQUFBQ0h0M1lYTmhZbWw5QUFRQUNRQUFBQWw3ZG1sdVpXZGhjbjBBQkFBSkFBQUFCM3R2Ym1sdmJuMEFCQUFKQUFBQUNIdG5hVzVuWlhKOUFBUUFDUUFBQUFaN1kyOXlibjBBQkFBSkFBQUFESHRqWld4bGNubGZiMmxzZlFBRUFBa0FBQUFJZTNSdmJXRjBiMzBBQkFBSkFBQUFDbnRqZFdOMWJXSmxjbjBBQkFBSkFBQUFDWHR0ZFhOMFlYSmtmUUFFQUFrQUFBQUllMk5oY25KdmRIMEFCQUFKQUFBQUJYdHVkWFI5QUFRQUNRQUFBQVo3Y0dWaGNuMEFCQUFKQUFBQUNIdHlZV1JwYzJoOUFBUUFDUUFBQUFaN1ltVmxabjBBQkFBSkFBQUFCbnQzYVc1bGZRQUVBQWtBQUFBUGUyTm9hV05yWlc1ZlluSnZkR2g5QUFRQUNRQUFBQWw3WTJocFkydGxibjBBQkFBSkFBQUFDSHR6ZEdGeVkyaDlBQVFBQ1FBQUFBWjdZbVZoYm4wQUJBQUpBQUFBQm50clpXeHdmUUFFQUFrQUFBQUxlMlZ1YjJ0cFpHRnJaWDBBQkFBSkFBQUFDbnR0ZFhOb2NtOXZiWDBBQkFBSkFBQUFDbnR6YUdscGRHRnJaWDBBQkFBSkFBQUFCbnR3YjNKcmZRQUVBQWtBQUFBSmUyTmhZbUpoWjJWOUFBUUFDUUFBQUFkN2MzRjFhV1I5QUFRQUNRQUFBQVo3WTJ4aGJYMEFCQUFKQUFBQUNIdHphSEpwYlhCOUFBUUFDUUFBQUFoN2NHVndjR1Z5ZlFBRUFBa0FBQUFJZTI5NWMzUmxjbjBBQkFBSkFBQUFCbnRtYVhOb2ZRQUVBQWtBQUFBTmUySmxiR3hmY0dWd2NHVnlmUUFFQUFrQUFBQU1lMkpsWldaZlluSnZkR2g5QUFRQUNRQUFBQkY3WTJocGJtVnpaVjlqWVdKaVlXZGxmUUFFQUFrQUFBQUxlM1psWjJWMFlXSnNaWDBBQkFBSkFBQUFCM3QzYUdWaGRIMEFCQUFKQUFBQURIdHpaWE5oYldWZmIybHNmUUFFQUFrQUFBQVZlM0p2WVhOMFpXUmZjMlZ6WVcxbFgzTmxaV1I5QUFRQUNRQUFBQTU3WW14aFkydGZjR1Z3Y0dWeWZRQUFCQUlBQUFQL0FBQUFFQUFBQUFFQUJBQUpBQUFBQm1aaFkzUnZjZ0FBQVA0QUFBQU9BQUFBQVQrTldhNTRxWlJpQUFBQURnQUFBQUUvMmk2TG91aTZMd0FBQUE0QUFBQUJQNkh2c2J1RUU1RUFBQUFPQUFBQUFVQUh3cWduMGJrK0FBQUFEUUFBQUFFQUFBQWtBQUFBRGdBQUFBRS8vTXpNek16TXpRQUFBQTRBQUFBQlArTFBKZnJZOGNRQUFBUUNBQUFBQVFBRUFBa0FBQUFGYm1GdFpYTUFBQUFRQUFBQUNRQUVBQWtBQUFBRVpuSnZiUUFFQUFrQUFBQUNkRzhBQkFBSkFBQUFCM04xY0hCdmNuUUFCQUFKQUFBQUNtTnZibVpwWkdWdVkyVUFCQUFKQUFBQUNHTnZkbVZ5WVdkbEFBUUFDUUFBQUFSc2FXWjBBQVFBQ1FBQUFBVmpiM1Z1ZEFBRUFBa0FBQUFLYUhsd1pYSmZiR2xtZEFBRUFBa0FBQUFHZDJWcFoyaDBBQUFFQWdBQUFBRUFCQUFKQUFBQUNYSnZkeTV1WVcxbGN3QUFBQTBBQUFBQ2dBQUFBUC8vLy84QUFBUUNBQUFEL3dBQUFCQUFBQUFCQUFRQUNRQUFBQXBrWVhSaExtWnlZVzFsQUFBQS9nQUFCQUlBQUFBQkFBUUFDUUFBQUFkdmNIUnBiMjV6QUFBQ0V3QUFBQVVBQUFBUUFBQUFBUUFFQUFrQUFBQUJjZ0FBQUJBQUFBQUJBQVFBQ1FBQUFCQjFibTVoYldWa0xXTm9kVzVyTFRFMEFBQUFDZ0FBQUFFQUFBQUFBQUFBRFFBQUFBRUFBQUFCQUFBQURRQUFBQUVBQUFBSkFBQUVBZ0FBQlA4QUFBQVFBQUFBQlFBRUFBa0FBQUFHWlc1bmFXNWxBQVFBQ1FBQUFBVnNZV0psYkFBRUFBa0FBQUFPY205M2JtRnRaWE11Y0hKcGJuUUFCQUFKQUFBQUNuSnZkM011ZEc5MFlXd0FCQUFKQUFBQUNtTnZiSE11ZEc5MFlXd0FBQUQrQUFBQS9nPT0ifQ== -->
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["from"],"name":[1],"type":["fctr"],"align":["left"]},{"label":["to"],"name":[2],"type":["fctr"],"align":["left"]},{"label":["support"],"name":[3],"type":["dbl"],"align":["right"]},{"label":["confidence"],"name":[4],"type":["dbl"],"align":["right"]},{"label":["coverage"],"name":[5],"type":["dbl"],"align":["right"]},{"label":["lift"],"name":[6],"type":["dbl"],"align":["right"]},{"label":["count"],"name":[7],"type":["int"],"align":["right"]},{"label":["hyper_lift"],"name":[8],"type":["dbl"],"align":["right"]},{"label":["weight"],"name":[9],"type":["dbl"],"align":["right"]}],"data":[{"1":"{corn}","2":"{starch}","3":"0.01433121","4":"0.4090909","5":"0.03503185","6":"2.970047","7":"36","8":"1.8","9":"0.5877867"}],"options":{"columns":{"min":{},"max":[10],"total":[9]},"rows":{"min":[10],"max":[10],"total":[1]},"pages":{}}}
</script>
</div>
<!-- rnb-frame-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
La confianza es considerablemente alta, aunque tenemos pocos datos de esta combinación. Podemos examinar algunos ejemplos:
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxucmVjZXRhcyAlPiUga2VlcCh+IFwidG9tYXRvXCIgJWluJSAueCAmIFwiY29yblwiICVpbiUgLngpICU+JSBoZWFkKDEwKVxuYGBgIn0= -->
```r
recetas %>% keep(~ "tomato" %in% .x & "corn" %in% .x) %>% head(10)
[[1]]
[1] "tomato" "vinegar" "pork" "celery_oil" "leek"
[6] "corn" "black_pepper" "pepper" "ginger" "pea"
[11] "garlic" "soybean" "soy_sauce" "chicken_broth" "wine"
[[2]]
[1] "tomato" "vinegar" "pepper" "celery_oil" "corn" "cayenne"
[7] "pork" "garlic" "soybean" "vegetable" "coriander" "rice"
[13] "soy_sauce"
[[3]]
[1] "tomato" "vinegar" "pork" "celery_oil" "soy_sauce" "ginger"
[7] "garlic" "sherry" "corn"
[[4]]
[1] "pepper" "celery_oil" "starch" "corn" "ginger"
[6] "garlic" "soybean" "tomato" "vinegar" "beef"
[11] "soy_sauce" "cayenne" "scallion" "bell_pepper" "vegetable_oil"
[16] "rice" "wine"
[[5]]
[1] "tomato" "vinegar" "pork" "celery_oil" "beef" "soy_sauce"
[7] "ginger" "garlic" "corn" "wine"
[[6]]
[1] "tomato" "vinegar" "pepper" "lemon_juice" "celery_oil" "sake"
[7] "corn" "pork" "ginger" "honey" "garlic" "soybean"
[13] "rice" "soy_sauce"
[[7]]
[1] "tomato" "garlic" "onion" "bacon" "corn" "cayenne" "egg"
[[8]]
[1] "pork" "green_bell_pepper" "celery_oil" "starch"
[5] "corn" "garlic" "tomato" "vinegar"
[9] "onion" "soy_sauce" "cider" "scallion"
[13] "celery" "pineapple" "vegetable_oil" "egg"
[[9]]
[1] "tomato" "vinegar" "pepper" "celery_oil" "roasted_pork"
[6] "soy_sauce" "ginger" "honey" "garlic" "cinnamon"
[11] "soybean" "sherry" "corn" "oyster"
[[10]]
[1] "cane_molasses" "tomato" "pork" "celery_oil" "vinegar"
[6] "soy_sauce" "pepper" "ginger" "garlic" "corn"